# load packages
library(quanteda)
library(tidytext)
library(tidyverse)
library(corrplot)
# devtools::install_github("rkabacoff/factorAnalysis")
library(factorAnalysis)
library(psych)
library(countrycode)
library(WDI)
library(rio)
library(democracyData)
library(sjlabelled)

# Inputs ----
# Report-level data; the steps below add sentiment scores to it as new columns.
reports <- readRDS(file = "reports.RDS")

# Tokenized report texts, used for all dictionary look-ups below.
# Presumably one document per row of `reports`, in the same order -- confirm.
toks <- readRDS(file = "tokens.RDS")

# dictionary analysis
# For each sentiment dictionary every report receives two scores:
#   <name>  : (negative matches - positive matches) / number of tokens
#   <name>2 : log((negative + 0.5) / (positive + 0.5))
# Larger values therefore mean a more negative text.

# Token count per document. It does not depend on the dictionary, so it is
# computed once here instead of rebuilding dfm(toks) for every score.
n_tokens <- ntoken(dfm(toks))

### lsd
toks_lsd <- tokens_lookup(toks, dictionary = data_dictionary_LSD2015[1:2])
dfm_lsd <- dfm(toks_lsd)
# Columns are selected by key name rather than by position so that a score
# cannot silently flip or shift if the feature order of the dfm ever differs.
reports$LSD <- as.numeric(
  (dfm_lsd[, "negative"] - dfm_lsd[, "positive"]) / n_tokens
)
reports$LSD2 <- as.numeric(
  log((dfm_lsd[, "negative"] + 0.5) / (dfm_lsd[, "positive"] + 0.5))
)

### afinn
# AFINN carries a signed numeric value per word; only its sign is used here.
afinn <- get_sentiments("afinn")
dict_afinn <- dictionary(list(
  negative = afinn$word[afinn$value < 0],
  positive = afinn$word[afinn$value > 0]
))
toks_afinn <- tokens_lookup(toks, dictionary = dict_afinn)
dfm_afinn <- dfm(toks_afinn)
reports$AFINN <- as.numeric(
  (dfm_afinn[, "negative"] - dfm_afinn[, "positive"]) / n_tokens
)
reports$AFINN2 <- as.numeric(
  log((dfm_afinn[, "negative"] + 0.5) / (dfm_afinn[, "positive"] + 0.5))
)

### bing
bing <- get_sentiments("bing")
dict_bing <- dictionary(list(
  negative = bing$word[bing$sentiment == "negative"],
  positive = bing$word[bing$sentiment == "positive"]
))
toks_bing <- tokens_lookup(toks, dictionary = dict_bing)
dfm_bing <- dfm(toks_bing)
reports$bing <- as.numeric(
  (dfm_bing[, "negative"] - dfm_bing[, "positive"]) / n_tokens
)
reports$bing2 <- as.numeric(
  log((dfm_bing[, "negative"] + 0.5) / (dfm_bing[, "positive"] + 0.5))
)

#### nrc
# Only the "negative" and "positive" categories of NRC are used.
nrc <- get_sentiments("nrc")
dict_nrc <- dictionary(list(
  negative = nrc$word[nrc$sentiment == "negative"],
  positive = nrc$word[nrc$sentiment == "positive"]
))
toks_nrc <- tokens_lookup(toks, dictionary = dict_nrc)
dfm_nrc <- dfm(toks_nrc)
reports$nrc <- as.numeric(
  (dfm_nrc[, "negative"] - dfm_nrc[, "positive"]) / n_tokens
)
reports$nrc2 <- as.numeric(
  log((dfm_nrc[, "negative"] + 0.5) / (dfm_nrc[, "positive"] + 0.5))
)

# One row per word occurrence: the `text` column of each report is split into
# tokens stored in a new `word` column.
reports_tidy <- unnest_tokens(reports, output = word, input = text)

# Fig. B1: LSD dictionary words that occur most often in the reports ----

# Make sure the output folder exists before the first figure is written
# (does nothing when the folder is already there).
dir.create("Figures", showWarnings = FALSE, recursive = TRUE)

# Flatten the first two LSD keys into a word / sentiment lookup table.
lsd2 <- tibble(
  word = unlist(data_dictionary_LSD2015[1:2]),
  sentiment = c(
    rep("negative", length(unlist(data_dictionary_LSD2015[1]))),
    rep("positive", length(unlist(data_dictionary_LSD2015[2])))
  )
)

# NOTE(review): quanteda documents the LSD2015 entries as glob patterns (many
# end in `*`). The exact-match join below can only hit entries that are
# literal words, whereas tokens_lookup() also matches the wildcard entries --
# confirm that this figure is meant to cover literal entries only.
# The join key is spelled out so it never depends on which other columns
# `reports` happens to share with the lookup table.
lsd_word_counts <- reports_tidy %>%
  inner_join(lsd2, by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment (ties kept by slice_max), one facet
# per sentiment.
lsd_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL) +
  theme_bw()
ggsave("Figures/fig_b1.jpg", height = 3.5, width = 7)

# Fig. B2: AFINN words that occur most often in the reports ----

# Reduce the signed AFINN value to a positive / negative label.
afinn2 <- get_sentiments("afinn") %>%
  transmute(word = word, sentiment = ifelse(value > 0, "positive", "negative"))

# The join key is spelled out so it never depends on which other columns
# `reports` happens to share with the lexicon. count() on ungrouped data
# already returns an ungrouped table, so no ungroup() is needed afterwards.
afinn_word_counts <- reports_tidy %>%
  inner_join(afinn2, by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment (ties kept by slice_max), one facet
# per sentiment.
afinn_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL) +
  theme_bw()
ggsave("Figures/fig_b2.jpg", height = 3.5, width = 7)

# Fig. B3: bing words that occur most often in the reports ----

# The join key is spelled out so it never depends on which other columns
# `reports` happens to share with the lexicon. count() on ungrouped data
# already returns an ungrouped table, so no ungroup() is needed afterwards.
bing_word_counts <- reports_tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment (ties kept by slice_max), one facet
# per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL) +
  theme_bw()
ggsave("Figures/fig_b3.jpg", height = 3.5, width = 7)

# Fig. B4: NRC words that occur most often in the reports ----

# Keep only the two polarity categories of NRC.
nrc2 <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))

# The join key is spelled out so it never depends on which other columns
# `reports` happens to share with the lexicon. count() on ungrouped data
# already returns an ungrouped table, so no ungroup() is needed afterwards.
nrc_word_counts <- reports_tidy %>%
  inner_join(nrc2, by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment (ties kept by slice_max), one facet
# per sentiment.
nrc_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL) +
  theme_bw()
ggsave("Figures/fig_b4.jpg", height = 3.5, width = 7)

# description (fig 2)
# Mean of each length-normalised dictionary score by year and organization,
# one panel per dictionary with its own y scale.
reports %>%
  select(country_name, year, organization, LSD, AFINN, bing, nrc) %>%
  pivot_longer(
    c(LSD, AFINN, bing, nrc),
    names_to = "dictionary",
    values_to = "negativity"
  ) %>%
  group_by(year, dictionary, organization) %>%
  # Drop the grouping explicitly: the summary is only plotted, and this avoids
  # handing a partially grouped table (and the regrouping message) downstream.
  summarise(negativity = mean(negativity), .groups = "drop") %>%
  ggplot() +
  geom_col(aes(year, negativity, fill = organization), position = "dodge") +
  labs(x = "Year", y = "Mean Negativity", fill = "Organization") +
  facet_wrap(~ dictionary, ncol = 1, scales = "free_y") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("Figures/fig2.jpg", height = 5, width = 7)

# The four length-normalised dictionary scores, compared with each other below.
validation <- select(reports, LSD, AFINN, bing, nrc)

# correlations (fig C1)
# corrplot draws with base graphics, so the figure is written by opening a
# jpeg device, plotting, and closing the device again.
jpeg(filename = "Figures/fig_c1.jpg")
corrplot.mixed(
  corr = cor(validation),
  order = "AOE",
  number.cex = 1.5,
  tl.cex = 1.5,
  cl.cex = 1.25
)
dev.off()

# PCA (fig C2)
# Scree plot of the four scores with title, subtitle and caption removed.
screePlot(validation) +
  theme(
    plot.title = element_blank(),
    plot.subtitle = element_blank(),
    plot.caption = element_blank(),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm")
  )
ggsave("Figures/fig_c2.jpg", height = 3, width = 5)

# Single-component solution on the length-normalised scores; printed for
# inspection.
pc <- principal(validation, nfactors = 1, scores = TRUE)
pc

# The component scores become the shaming measure.
reports$shaming <- pc$scores[, 1]

# Same procedure on the log-ratio scores gives the alternative measure.
validation2 <- select(reports, LSD2, AFINN2, bing2, nrc2)
pc2 <- principal(validation2, nfactors = 1, scores = TRUE)
reports$shaming2 <- pc2$scores[, 1]

# visualize the shaming scores of the Amnesty International reports and
# US Department of State reports (fig 3)
reports %>%
  mutate(decade = case_when(
    # Conditions are checked in order, so each one only needs its upper bound.
    year < 1990 ~ "1980s",
    year < 2000 ~ "1990s",
    # Everything from 2000 onwards is grouped under "2000s".
    year > 1999 ~ "2000s"
  )) %>%
  ggplot() +
  geom_point(
    aes(decade, shaming, color = organization),
    # Default jitter, but with a fixed seed so that re-running the script
    # reproduces exactly the same figure.
    position = position_jitter(seed = 1)
  ) +
  labs(
    x = "Decade (with jitter)",
    y = "Shaming Score",
    color = "Organization"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("Figures/fig3.jpg", height = 3.5, width = 7)

# construct tidy data
# Balanced country-year skeleton: every country code that appears in the
# reports is paired with every year from 1981 to 2010, years running fastest.
panel_countries <- unique(reports$country_cow_code)
panel_years <- 1981:2010
data <- tibble(
  country_cow_code = rep(panel_countries, each = length(panel_years)),
  year = rep(panel_years, length(panel_countries))
)

# import country code
# Crosswalk between the COW numeric code (`cown`) and the ISO2 code, by year;
# the ISO2 code is what links the panel to the World Bank data.
countries <- select(codelist_panel, year, cown, iso2c)

# import the media freedom data
media <- select(read_csv("GMFD_V2.csv"), -country)

# import the GDP per capita and population data from World Bank
wb_data <- WDI(
  indicator = c(gdppc = "NY.GDP.PCAP.KD", pop = "SP.POP.TOTL"),
  start = 1981,
  end = 2010
)

# import the civil war data
civwar <- import("Civil War.dta") %>%
  transmute(
    country_cow_code = ccode,
    year,
    civwar = as.integer(civwar)
  )

# The Geddes Wright and Frantz Autocratic Regimes dataset
# `democracy` is 1 for every row whose `gwf_regimetype` is missing and 0
# otherwise.
gwf <- gwf_all %>%
  transmute(
    country_cow_code = gwf_cowcode,
    year,
    democracy = ifelse(is.na(gwf_regimetype), 1, 0)
  )

# merge the data
# Shaming scores of each organization, renamed so both fit in one row.
ai_shaming <- reports %>%
  filter(organization == "Amnesty International") %>%
  dplyr::select(country_cow_code, year,
                ai_shaming = shaming, ai_shaming2 = shaming2)
us_shaming <- reports %>%
  filter(organization == "State Department") %>%
  dplyr::select(country_cow_code, year,
                us_shaming = shaming, us_shaming2 = shaming2)

# Attach everything to the country-year skeleton. The country crosswalk must
# be joined before the World Bank data, which is keyed on `iso2c`.
data <- data %>%
  left_join(ai_shaming, by = c("country_cow_code", "year")) %>%
  left_join(us_shaming, by = c("country_cow_code", "year")) %>%
  left_join(
    distinct(reports[, c("country_cow_code", "year", "fariss.mean")]),
    by = c("country_cow_code", "year")
  ) %>%
  left_join(countries, by = c("year", "country_cow_code" = "cown")) %>%
  left_join(media, by = c("year", "country_cow_code" = "ccode")) %>%
  left_join(wb_data, by = c("year", "iso2c")) %>%
  left_join(civwar, by = c("year", "country_cow_code")) %>%
  left_join(gwf, by = c("year", "country_cow_code"))

# Every join above is meant to add columns only. A source table with repeated
# keys would silently multiply rows, so flag any country-year that now
# appears more than once.
n_dup_rows <- sum(duplicated(data[, c("country_cow_code", "year")]))
if (n_dup_rows > 0) {
  warning(
    n_dup_rows, " duplicated country-year rows after merging; ",
    "check the join keys of the source tables.",
    call. = FALSE
  )
}

# generate lead dependent variable
# `fariss_next` is the following year's `fariss.mean` within each country.
data <- data %>%
  group_by(country_cow_code) %>%
  mutate(
    # Order by year explicitly so the lead never depends on the row order
    # left behind by the preceding joins.
    fariss_next = lead(fariss.mean, order_by = year),
    ln_pop = log(pop),
    ln_gdppc = log(gdppc)
  ) %>%
  ungroup()

# label the variables
# Attach a variable label to each analysis column. The main and the
# alternative ("2") shaming measures share the same label.
data <- var_labels(
  data,
  fariss.mean = "Lagged HRS",
  fariss_next = "Human Rights Score",
  ai_shaming = "AI Shaming",
  ai_shaming2 = "AI Shaming",
  us_shaming = "US Shaming",
  us_shaming2 = "US Shaming",
  ln_pop = "log Population",
  ln_gdppc = "log GDP Per Capita",
  mediascore = "Media Freedom Score",
  democracy = "Democracy",
  civwar = "Civil War"
)

# save the data
saveRDS(data, file = "data.RDS")